# ============================================================
# Home Data for ML Course (Kaggle Learn Users)
# CatBoost 改良版（A+B+C 全部入り）
#
# 目的：Public LB RMSE を 12000 に近づける（小さいほど良い）
#
# A) 学習ロスを MAE にする（外れ値に引っ張られにくくしてRMSEが下がることがある）
#    - loss_function="MAE"
#    - eval_metric="RMSE"  （評価はRMSE）
#
# B) depth を 10 にする（非線形・相互作用をより拾う）
#
# C) iterations を増やし、early stopping に任せる
#    - iterations=50000
#    - learning_rate=0.02
#    - od_wait=500
#
# さらに効きやすい定番として：
#  - 外れ値除去（超広いのに安い）
#  - 軽量な派生特徴量（TotalSF, HouseAge, RemodAge, TotalBath, Qual×Area）
#
# 実行方法：
#  - Kaggle Notebook の1セルにコピペして実行
#  - submission.csv が生成されるので提出
# ============================================================

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor, Pool


# ------------------------------------------------------------
# 1) Load
# ------------------------------------------------------------
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test  = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# ------------------------------------------------------------
# 2) Outlier removal (often helps)
# ------------------------------------------------------------
# Drop training rows with a very large living area but a low sale price.
is_outlier = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train = train.drop(index=train.index[is_outlier])

y = train["SalePrice"].copy()

# "Id" is a row identifier, not a property of the house, so it must not be
# fed to the model as a numeric feature. It is removed from the feature
# frames only; `test` itself keeps "Id" for the submission file.
X = train.drop(columns=["SalePrice"]).drop(columns=["Id"], errors="ignore")
X_test = test.drop(columns=["Id"], errors="ignore")

# ------------------------------------------------------------
# 3) Feature engineering（軽量だが効きやすいものだけ）
# ------------------------------------------------------------
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a few lightweight derived features added.

    The input frame is not modified. Every derived column is only created
    when the columns it depends on are present, except ``TotalSF``, which is
    always created (absent area columns contribute 0).

    Columns that may be added, in this order:
    ``TotalSF``, ``HouseAge``, ``RemodAge``, ``TotalBath``,
    ``Qual_x_GrLivArea``, ``Qual_x_TotalSF``.

    Args:
        df: Raw feature frame.

    Returns:
        A new DataFrame with zero-filled area/bath columns and the derived
        feature columns appended.
    """
    out = df.copy()
    present = set(out.columns)

    # Area and bathroom counts: a missing value is treated as "none" (0).
    zero_fill_cols = (
        "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea",
        "FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath",
    )
    for name in zero_fill_cols:
        if name in present:
            out[name] = out[name].fillna(0)

    # Total floor area: basement + first floor + second floor.
    basement, first_floor, second_floor = (
        out.get(name, 0) for name in ("TotalBsmtSF", "1stFlrSF", "2ndFlrSF")
    )
    out["TotalSF"] = basement + first_floor + second_floor

    # Years between construction / remodelling and the sale.
    if "YrSold" in present:
        age_features = (("HouseAge", "YearBuilt"), ("RemodAge", "YearRemodAdd"))
        for feature, year_col in age_features:
            if year_col in present:
                out[feature] = out["YrSold"] - out[year_col]

    # Bathroom total, with half baths counted as 0.5.
    bath_cols = ("FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath")
    if present.issuperset(bath_cols):
        above_ground = out["FullBath"] + 0.5 * out["HalfBath"]
        out["TotalBath"] = (
            above_ground + out["BsmtFullBath"] + 0.5 * out["BsmtHalfBath"]
        )

    # Quality x area interactions.
    if "OverallQual" in present:
        quality = out["OverallQual"]
        if "GrLivArea" in present:
            out["Qual_x_GrLivArea"] = quality * out["GrLivArea"]
        out["Qual_x_TotalSF"] = quality * out["TotalSF"]

    return out

# Add the derived features to both the training and the test frame.
X, X_test = add_features(X), add_features(X_test)

# ------------------------------------------------------------
# 4) Categorical columns for CatBoost
# ------------------------------------------------------------
# Object-dtype columns are treated as categorical; CatBoost receives their
# positional indices within X.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
_cat_lookup = set(cat_cols)
cat_idx = [pos for pos, col in enumerate(X.columns) if col in _cat_lookup]
print("Categorical columns:", len(cat_cols))

# ------------------------------------------------------------
# 5) Minimal missing handling (for robustness)
# ------------------------------------------------------------
# Numeric columns are everything that is not categorical. Their medians are
# taken from the training frame only and reused for the test frame.
num_cols = list(X.columns.difference(cat_cols))
med = X[num_cols].median()

for _frame in (X, X_test):
    # CatBoost copes well with missing values, but NaN in object columns is
    # safer when replaced by an explicit string.
    _frame[cat_cols] = _frame[cat_cols].fillna("Missing")
    # Numeric gaps are filled with the training medians.
    _frame[num_cols] = _frame[num_cols].fillna(med)

# ------------------------------------------------------------
# 6) Target transform: log1p (same direction as the earlier ~13291 runs)
# ------------------------------------------------------------
y_log = np.log1p(y)

# ------------------------------------------------------------
# 7) KFold training + averaging predictions
# ------------------------------------------------------------
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions (log space) and fold-averaged test predictions
# (price space).
oof_log = np.zeros(len(X))
test_pred = np.zeros(len(X_test))

# The test pool does not depend on the fold, so it is built once up front.
test_pool = Pool(X_test, cat_features=cat_idx)

for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
    # kf.split yields positional indices, hence .iloc everywhere.
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

    # =========================================================
    # CatBoost settings (A+B+C)
    # A) loss_function="MAE"   <- train on MAE, intended to be less
    #                             sensitive to outliers
    # B) depth=10              <- capture more complex interactions
    # C) many iterations, with early stopping (od_wait) deciding when to stop
    # =========================================================
    model = CatBoostRegressor(
        loss_function="MAE",      # A
        eval_metric="RMSE",       # evaluation metric is RMSE
        iterations=50000,         # C
        learning_rate=0.02,       # C
        depth=10,                 # B
        l2_leaf_reg=3.0,
        random_seed=42,
        od_type="Iter",
        od_wait=500,              # C: stop after 500 rounds without improvement
        verbose=300               # progress output (set to 0 to silence)
    )

    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    # --- validation ---
    pred_va_log = model.predict(valid_pool)
    oof_log[va_idx] = pred_va_log

    # Fold RMSE in price space (same unit as the submission score).
    # np.sqrt(MSE) is used instead of `squared=False`, which was deprecated
    # in scikit-learn 1.4 and removed in 1.6.
    pred_va = np.expm1(pred_va_log)
    y_va_price = np.expm1(y_va)
    fold_rmse = np.sqrt(mean_squared_error(y_va_price, pred_va))
    print(f"[Fold {fold}] RMSE(price): {fold_rmse:.2f}")

    # --- test prediction (averaged over folds, in price space) ---
    pred_test_log = model.predict(test_pool)
    test_pred += np.expm1(pred_test_log) / kf.n_splits

# Overall CV RMSE (price space) from the out-of-fold predictions.
cv_rmse = np.sqrt(mean_squared_error(np.expm1(y_log), np.expm1(oof_log)))
print(f"\n[CV] RMSE(price): {cv_rmse:.2f}")

# ------------------------------------------------------------
# 8) Submission
# ------------------------------------------------------------
# Pair each test Id with its fold-averaged price prediction and write the
# result without the index column.
submission = test[["Id"]].assign(SalePrice=test_pred)
submission.to_csv("submission.csv", index=False)
print("✅ saved: submission.csv")
